Get forecasts from COVID Hub

First fetch forecast dates for a bunch of forecasters. TODO (this is more of a general one to make this notebook better, and not specific to evalcast dev): we should do this more comprehensively. Pull all forecasters that have enough submissions to make useful comparisons. To do this, we can expose the function get_covidhub_forecaster_names().

# COVID Hub forecaster submission names to pull.
# Google_Harvard-CPF is excluded for now: it doesn't have quantiles for all
# forecasts.
forecasters <- c(
  "CMU-TimeSeries",
  "YYG-ParamSearch",
  "UMass-MechBayes",
  "GT-DeepCOVID",
  "IHME-CurveFit",
  "LANL-GrowthRate",
  "UCLA-SuEIR",
  "MOBS-GLEAM_COVID",
  "UT-Mobility",
  "OliverWyman-Navigator",
  "JHU_IDD-CovidSP",
  "CovidAnalytics-DELPHI",
  "Yu_Group-CLEP",
  "COVIDhub-ensemble",
  "COVIDhub-baseline"
)

# Get all forecast dates for these forecasters from COVID Hub.
# On a fetch failure we print the error and record an *empty* date vector for
# that forecaster, so one bad forecaster doesn't halt the loop.
forecast_dates = vector("list", length = length(forecasters))
for (i in seq_along(forecasters)) {
  forecast_dates[[i]] = tryCatch({
    as_date(get_forecast_dates(forecasters[i]))
  },
  error = function(e) {
    cat(sprintf("%i. %s\n", i, e$message))
    # Bug fix: the original handler returned cat()'s NULL, and assigning NULL
    # via forecast_dates[[i]] <- NULL *deletes* the list element, silently
    # shifting the remaining elements out of alignment with `forecasters`.
    # Returning an empty Date vector keeps positions stable.
    as_date(integer(0))
  }
  )
}

Load data from previous run so we don’t have to reingest / process it. This data could end up out of date if a forecast is retrospectively updated, but in that case it’s no longer a true prediction. We can always restart from scratch by deleting predictions_cards.rda.

# Restore predictions cached by a previous run, if the cache file exists.
# (The cache can be reset by deleting predictions_cards.rda.)
if (file.exists("predictions_cards.rda")) {
  load(file = "predictions_cards.rda")
}
# Record which (forecast_date, forecaster) pairs we have already ingested, so
# the download loop below can skip them. `seen_dates` only exists if the
# cache was present and loaded.
if(exists("predictions_cards")){
seen_dates = predictions_cards %>% 
  distinct(forecast_date, forecaster)
}

Now figure out “comparable” forecast dates: making a forecast on a Sunday or a Monday of the same epiweek should be comparable. TODO: we should switch over to using the Zoltar API, and soon it should have an “as of” parameter, so then we shouldn’t need to do this.

# CMU-TimeSeries is the reference forecaster (first element of `forecasters`).
forecast_dates_cmu = forecast_dates[[1]]

# new_dates: per forecaster, the comparable dates we don't yet have data for.
new_dates = vector("list", length(forecasters))
for (i in seq_along(forecasters)) {
  given_dates = forecast_dates[[i]]
  # A date is comparable if it matches a CMU date exactly, or if it falls on
  # a Sunday (lubridate wday == 1) whose following Monday is a CMU date (same
  # epiweek). The original relied on `&` binding tighter than `|`; the
  # grouping is now explicit so the intent is unambiguous.
  comparable_forecast_dates = given_dates[
    (given_dates %in% forecast_dates_cmu) |
      (((given_dates + 1) %in% forecast_dates_cmu) & wday(given_dates) == 1)
  ]

  # ...but if both a Sunday and its following Monday survived, drop the
  # Sunday: prefer the prediction made on the later day.
  comparable_forecast_dates = comparable_forecast_dates[
    !((comparable_forecast_dates + 1) %in% comparable_forecast_dates)
  ]

  # Remove dates already ingested in a previous run (if any cache was loaded).
  if (exists("seen_dates")) {
    if (forecasters[i] %in% seen_dates$forecaster) {
      seen_forecaster_dates = (seen_dates %>%
                                 filter(forecaster == forecasters[i]))$forecast_date
      # setdiff() drops the Date class, hence the as_date() wrapper.
      comparable_forecast_dates = as_date(
        setdiff(comparable_forecast_dates, seen_forecaster_dates))
    }
  }
  new_dates[[i]] = comparable_forecast_dates
}
names(new_dates) = forecasters

Now get predictions for each forecaster, looping over forecast dates from the CMU-TimeSeries model. TODO: this part is very very slow. Maybe (hopefully) by changing to use the Zoltar API, this will be much faster.

# Download predictions for each forecaster, restricted to: ahead < 5,
# state-level (2-char geo) incident deaths, county-level (5-char geo)
# incident cases. Failures are printed and leave the list slot empty.
predictions_cards_list = vector("list", length = length(forecasters))
deaths_sig = "deaths_incidence_num"
cases_sig = "confirmed_incidence_num"
for (i in seq_along(forecasters)) {
  cat(forecasters[i], "...\n")
  # Bug fix: the original condition was length(new_dates[[i]] > 0), i.e. the
  # length of a logical vector -- it behaved correctly only by accident.
  if (length(new_dates[[i]]) > 0) {
    predictions_cards_list[[i]] = tryCatch({
      get_covidhub_predictions(forecasters[i],
                               rev(new_dates[[i]])) %>%
        filter(ahead < 5) %>%
        filter((nchar(geo_value) == 2 & signal == deaths_sig) |
                 (nchar(geo_value) == 5 & signal == cases_sig))
    },
    error = function(e) cat(e$message))
  }
}
## CMU-TimeSeries ...
## YYG-ParamSearch ...
## UMass-MechBayes ...
## GT-DeepCOVID ...
## IHME-CurveFit ...
## LANL-GrowthRate ...
## UCLA-SuEIR ...
## MOBS-GLEAM_COVID ...
## UT-Mobility ...
## OliverWyman-Navigator ...
## JHU_IDD-CovidSP ...
## CovidAnalytics-DELPHI ...
## Yu_Group-CLEP ...
## COVIDhub-ensemble ...
## COVIDhub-baseline ...
predictions_cards_new = bind_rows(predictions_cards_list)

# Combine old (cached) and new predictions cards. bind_rows() matches columns
# by name -- unlike rbind(), which requires identical column order -- so a
# schema-ordering difference between the cache and freshly fetched cards
# cannot silently corrupt the result.
if (exists("predictions_cards")) {
  predictions_cards = bind_rows(predictions_cards, predictions_cards_new)
} else {
  predictions_cards = predictions_cards_new
}

# Discard any forecasters without data
forecasters = unique(as.data.frame(predictions_cards)$forecaster)

On reflection: part of the problem here is that get_covidhub_predictions() downloads all predictions from COVID Hub and then filters them as needed. This makes the above extra slow because we end up downloading state and county forecasts, when we just want state forecasts. TODO: even before switching over to use the Zoltar API, we should redesign get_covidhub_predictions() so that it fetches from GitHub only the forecasts we specify, if possible.

Evaluate predictions

Hack: must change the response data source to be USAFacts, as JHU-CSSE data is currently unstable. TODO: we shouldn’t require evaluate_predictions() to have the response match what’s in the forecaster. If I train my forecaster on (say) JHU-CSSE data, then I should be able to evaluate it on USAFacts data.

# Force the response data source to USAFacts (JHU-CSSE is currently
# unstable), then cache everything fetched so far so reruns can skip the
# slow download step.
predictions_cards$data_source = "usa-facts"
save(list = c("predictions_cards"), 
     file = "predictions_cards.rda", 
     compress = "xz")

Evaluate the predictions based on weighted interval score (WIS), absolute error (AE), and coverage of the central 80% prediction interval. TODO: we need to make sure evalcast “fails gracefully” in as many places as possible, and doesn’t throw an error (which would halt all execution) when it should instead just be throwing a warning and moving on. Basically I needed to artificially trim the forecast dates to have the latest one be “2020-09-28” so that I wouldn’t get errors with the aheads here. This is undesirable obviously …

Also, another TODO: just want to note that this is again very slow. Caching would help but I don’t think it’s the right solution. We’re fetching the exact same data something like 15 times in a row. If we simply changed the order of operations then we wouldn’t even need caching at all.

# Error measures: weighted interval score, absolute error, and coverage of
# the central 80% prediction interval.
err_measures <- list(
  wis = weighted_interval_score,
  ae = absolute_error,
  cov_80 = interval_coverage(coverage = 0.8)
)

# Only predictions whose target date has already passed can be scored.
preds_to_eval <- predictions_cards %>%
  filter(target_end_date < today())

# State-level predictions (2-character geo codes).
preds_to_eval_state <- preds_to_eval %>%
  filter(nchar(geo_value) == 2)

# Restore cached state scores (if any) and skip already-scored predictions.
if (file.exists("score_cards_state.rda")) {
  load("score_cards_state.rda")
}
if (exists("score_cards_state")) {
  preds_to_eval_state <- anti_join(preds_to_eval_state,
                                   score_cards_state,
                                   by = c("ahead", "forecaster", "forecast_date"))
}

# Score whatever is new, if anything.
if (nrow(preds_to_eval_state) > 0) {
  score_cards_new_state <- evaluate_predictions(preds_to_eval_state,
                                                err_measures,
                                                backfill_buffer = 0)
} else {
  score_cards_new_state <- data.frame()
}

# Append new scores to the cached ones.
if (exists("score_cards_state")) {
  score_cards_state <- rbind(score_cards_state, score_cards_new_state)
} else {
  score_cards_state <- score_cards_new_state
}

# County-level predictions (5-character geo codes, i.e. FIPS).
preds_to_eval_county <- preds_to_eval %>%
  filter(nchar(geo_value) == 5)

# Restore cached county scores (if any) and skip already-scored predictions.
if (file.exists("score_cards_county.rda")) {
  load("score_cards_county.rda")
}
if (exists("score_cards_county")) {
  preds_to_eval_county <- anti_join(preds_to_eval_county,
                                    score_cards_county,
                                    by = c("ahead", "forecaster", "forecast_date"))
}

# Keep only forecasters that submitted more than one distinct non-NA quantile
# level: anything less cannot be scored with interval-based measures.
if (nrow(preds_to_eval_county) > 0) {
  evaluable_forecasters <- (preds_to_eval_county %>%
                              distinct(forecaster, quantile) %>%
                              filter(!is.na(quantile)) %>%
                              count(forecaster) %>%
                              filter(n > 1))$forecaster
  preds_to_eval_county <- preds_to_eval_county %>%
    filter(forecaster %in% evaluable_forecasters)
}

# Score whatever is new, if anything.
if (nrow(preds_to_eval_county) > 0) {
  score_cards_new_county <- evaluate_predictions(preds_to_eval_county,
                                                 err_measures,
                                                 backfill_buffer = 0)
} else {
  score_cards_new_county <- data.frame()
}

# Append new scores to the cached ones.
if (exists("score_cards_county")) {
  score_cards_county <- rbind(score_cards_county, score_cards_new_county)
} else {
  score_cards_county <- score_cards_new_county
}

As the above contained some pretty expensive steps, we save the results in an RData file (and we set eval = FALSE on this and all the above code chunks).

# Cache the (expensive) score computations for future runs.
save(list = c("score_cards_county"), 
     file = "score_cards_county.rda", 
     compress = "xz")
save(list = c("score_cards_state"), 
     file = "score_cards_state.rda", 
     compress = "xz")
# Drop everything except the three data frames the analysis below needs.
# NOTE(review): rm() over ls() is usually discouraged in scripts; here it is
# a deliberate notebook-session cleanup.
rm(list = setdiff(ls(), 
                  c("predictions_cards", 
                    "score_cards_county", 
                    "score_cards_state")))

Compute summary statistics.

load(file = "predictions_cards.rda")
load(file = "score_cards_county.rda")
load(file = "score_cards_state.rda")

# Restrict scores to the (ahead, geo_value, target_end_date) combinations
# that CMU-TimeSeries actually forecasted, so comparisons below are
# apples-to-apples. The `by` argument is now explicit (these three columns
# are the only columns of the *_cmu tables): this silences dplyr's implicit
# natural-join message and guards against future column additions changing
# the join keys.
score_cards_county_cmu = score_cards_county %>% 
                    filter(forecaster == "CMU-TimeSeries") %>% 
                    distinct(ahead, geo_value, target_end_date)

score_cards_county = semi_join(score_cards_county, score_cards_county_cmu,
                               by = c("ahead", "geo_value", "target_end_date"))

score_cards_state_cmu = score_cards_state %>% 
                    filter(forecaster == "CMU-TimeSeries") %>% 
                    distinct(ahead, geo_value, target_end_date)
score_cards_state = semi_join(score_cards_state, score_cards_state_cmu,
                              by = c("ahead", "geo_value", "target_end_date"))

# Count recent (last 4 weeks) non-missing WIS scores per forecaster and
# ahead, one column per ahead, plus a grand total, sorted busiest-first.
recent_score_by_frcstr <- score_cards_state %>%
  filter(target_end_date > today() - 28) %>%
  group_by(forecaster, ahead) %>%
  summarize(num = sum(!is.na(wis))) %>%
  pivot_wider(names_from = ahead,
              names_prefix = "num_",
              values_from = num) %>%
  mutate(num_total = num_1 + num_2 + num_3 + num_4) %>%
  arrange(desc(num_total))
print(recent_score_by_frcstr, n = Inf)
## # A tibble: 13 x 6
## # Groups:   forecaster [13]
##    forecaster            num_1 num_2 num_3 num_4 num_total
##    <chr>                 <int> <int> <int> <int>     <int>
##  1 CMU-TimeSeries         4896  4896  4896  4896     19584
##  2 CovidAnalytics-DELPHI  4896  4896  4896  4896     19584
##  3 COVIDhub-baseline      4896  4896  4896  4896     19584
##  4 COVIDhub-ensemble      4896  4896  4896  4896     19584
##  5 JHU_IDD-CovidSP        4896  4896  4896  4896     19584
##  6 LANL-GrowthRate        4896  4896  4896  4896     19584
##  7 OliverWyman-Navigator  4896  4896  4896  4896     19584
##  8 UCLA-SuEIR             4896  4896  4896  4896     19584
##  9 UMass-MechBayes        4896  4896  4896  4896     19584
## 10 UT-Mobility            4896  4896  4896  4896     19584
## 11 MOBS-GLEAM_COVID       4800  4800  4800  4800     19200
## 12 GT-DeepCOVID           4584  4464  4392  4344     17784
## 13 IHME-CurveFit          2448  1224  1224  1224      6120
# CMU's total recent submission count, used as the activity benchmark.
cmu_forecasts = recent_score_by_frcstr[recent_score_by_frcstr$forecaster == "CMU-TimeSeries",]$num_total

# Keep only forecasters with > 80% as many recent scores as CMU-TimeSeries.
# In the run shown above, only IHME-CurveFit falls below this threshold.
# NOTE(review): a forecaster missing an ahead column gets num_total = NA,
# which compares to NA and is silently dropped here -- presumably intended,
# but worth confirming.
active_forecasters = recent_score_by_frcstr[recent_score_by_frcstr$num_total > cmu_forecasts * 0.80,]$forecaster
score_cards_state = score_cards_state %>% 
                    filter(forecaster %in% active_forecasters)

# Same recent-activity tally as above, but for the county score cards.
recent_score_by_frcstr <- score_cards_county %>%
  filter(target_end_date > today() - 28) %>%
  group_by(forecaster, ahead) %>%
  summarize(num = sum(!is.na(wis))) %>%
  pivot_wider(names_from = ahead,
              names_prefix = "num_",
              values_from = num) %>%
  mutate(num_total = num_1 + num_2 + num_3 + num_4) %>%
  arrange(desc(num_total))
print(recent_score_by_frcstr, n = Inf)
## # A tibble: 7 x 6
## # Groups:   forecaster [7]
##   forecaster        num_1 num_2 num_3 num_4 num_total
##   <chr>             <int> <int> <int> <int>     <int>
## 1 CMU-TimeSeries     6400  6400  6400  6400     25600
## 2 COVIDhub-baseline  6400  6400  6400  6400     25600
## 3 COVIDhub-ensemble  6400  6400  6400  6400     25600
## 4 UCLA-SuEIR         6400  6400  6400  6400     25600
## 5 LANL-GrowthRate    6368  6368  6368  6336     25440
## 6 UMass-MechBayes    6248  6160  6192  6240     24840
## 7 JHU_IDD-CovidSP    6400  4800  4800  4800     20800
# CMU's total recent county submission count, used as the activity benchmark.
cmu_forecasts = recent_score_by_frcstr[recent_score_by_frcstr$forecaster == "CMU-TimeSeries",]$num_total

# Keep only forecasters with > 80% as many recent county scores as
# CMU-TimeSeries. (The comment that used to be here -- about
# CovidAnalytics-DELPHI and IHME-CurveFit -- was copied from the state
# section and does not apply at the county level; per the table above, no
# county forecaster actually falls below the threshold in this run.)
active_forecasters = recent_score_by_frcstr[recent_score_by_frcstr$num_total > cmu_forecasts * 0.80,]$forecaster
score_cards_county = score_cards_county %>% 
                    filter(forecaster %in% active_forecasters)
# TODO: Use evalcast::intersect_averagers after merging the kill-cards updates
#
# For each (ahead, geo_value, target_end_date) cell, count how many
# forecasters produced a forecast there. quantile == 0.5 serves as a proxy
# for "made a forecast": one row per forecaster per cell.
forecaster_coverage = score_cards_county %>% 
                        filter(quantile == 0.5) %>% 
                        group_by(ahead, geo_value, target_end_date) %>% 
                        summarise(num_forecasts = n())
max_forecasts = max(forecaster_coverage$num_forecasts)
table(forecaster_coverage$num_forecasts)
## 
##     4     5     6     7 
##    64    80  1040 12012
# Keep only the cells covered by every active forecaster, so all score
# comparisons below are over a common (location, date, ahead) set.
common_forecasts_county = forecaster_coverage %>% 
                            filter(num_forecasts == max_forecasts) %>%
                            select(ahead, geo_value, target_end_date)

# Explicit `by` (these are the only shared columns): silences dplyr's
# natural-join message and pins the join keys against schema drift.
score_cards_county = semi_join(score_cards_county, common_forecasts_county,
                               by = c("ahead", "geo_value", "target_end_date"))

# Same coverage count as above, for the state score cards: number of
# forecasters per (ahead, geo_value, target_end_date) cell, using the
# median-quantile row as a proxy for "made a forecast".
forecaster_coverage = score_cards_state %>% 
                        filter(quantile == 0.5) %>% 
                        group_by(ahead, geo_value, target_end_date) %>% 
                        summarise(num_forecasts = n())
max_forecasts = max(forecaster_coverage$num_forecasts)
table(forecaster_coverage$num_forecasts)
## 
##    5    9   10   11   12 
##  204   68  676 1272 1714
# Keep only the cells covered by every active forecaster (state level).
common_forecasts_state = forecaster_coverage %>% 
                            filter(num_forecasts == max_forecasts) %>%
                            select(ahead, geo_value, target_end_date)

# Explicit `by` (these are the only shared columns): silences dplyr's
# natural-join message and pins the join keys against schema drift.
score_cards_state = semi_join(score_cards_state, common_forecasts_state,
                              by = c("ahead", "geo_value", "target_end_date"))

Dot plots

First we make dot plots (not the same as that in evalcast): one dot per ahead, forecaster, and forecast date. The red plus marks the score computed over all dates. Here we use the mean as the aggregator function, and we study WIS, AE, and coverage-80.

# Aggregators that tolerate missing values (NAs are dropped, not propagated).
Mean <- function(x) {
  mean(x, na.rm = TRUE)
}
Median <- function(x) {
  median(x, na.rm = TRUE)
}

# Summarize a score column two ways and join the results:
#   - df_by_date: aggregate of `var` per (forecaster, ahead, target_end_date)
#     (column `var` in the output);
#   - df_overall: aggregate per (forecaster, ahead) (column `var_overall`),
#     plus an `order` column ranking forecasters within each ahead by their
#     overall score (used as the x-axis position in the dot plots).
#
# @param df   Score card data frame with forecaster/ahead/target_end_date.
# @param var  Name of the score column, as a string.
# @param aggr Aggregator function (default: NA-tolerant Mean).
# @return The full join of the two summaries, keyed by (forecaster, ahead).
summarize_var = function(df, var, aggr = Mean) {
  df_by_date = df %>% 
    group_by(forecaster, ahead, target_end_date) %>%
    summarize(var = aggr(!!as.symbol(var))) %>%
    ungroup()
  df_overall = df %>%
    group_by(forecaster, ahead) %>%
    summarize(var_overall = aggr(!!as.symbol(var))) %>%
    ungroup() %>% group_by(ahead) %>%
    arrange(var_overall, .by_group = TRUE) %>%
    ungroup() %>%
    mutate(order = row_number())
  # Return the join directly: the original ended with `df_sum = full_join(...)`
  # and relied on the assignment's (invisible) value as the return value.
  full_join(df_by_date, df_overall, by = c("forecaster", "ahead"))
}

# Dot plot of a score column: one dot per (forecaster, ahead, target date),
# colored by date, with a red '+' marking each forecaster's overall score.
# Faceted by ahead; forecasters ordered by overall score within each facet.
dot_plot = function(df, var = "wis", ylab = var, ylim = NULL, aggr = Mean) {
  plot_df = summarize_var(df, var, aggr)
  plot_df$ahead = factor(paste("ahead =", plot_df$ahead))

  ggplot(plot_df, aes(x = order, y = var)) +
    geom_point(aes(color = target_end_date)) +
    geom_point(aes(x = order, y = var_overall), color = "red", shape = 3) +
    facet_wrap(vars(ahead), scales = "free") +
    labs(x = "Forecaster", y = ylab) +
    scale_x_continuous(breaks = plot_df$order, labels = plot_df$forecaster) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8)) +
    coord_cartesian(ylim = ylim)
}

States

dot_plot(score_cards_state, var = "wis", ylab = "Mean WIS") + scale_y_log10()

dot_plot(score_cards_state, var = "ae", ylab = "Mean AE") + scale_y_log10()

dot_plot(score_cards_state, var = "cov_80", ylab = "Coverage-80", ylim = c(0,1)) +
  geom_hline(yintercept = 0.8)

Counties

dot_plot(score_cards_county, var = "wis", ylab = "Mean WIS") + scale_y_log10()

dot_plot(score_cards_county, var = "ae", ylab = "Mean AE") + scale_y_log10()

dot_plot(score_cards_county, var = "cov_80", ylab = "Coverage-80", ylim = c(0,1)) +
  geom_hline(yintercept = 0.8)

Dot plots: median and 90th percentile

Same as before, but change the aggregator function to the median. Omitting AE for brevity, henceforth.

States

dot_plot(score_cards_state, var = "wis", ylab = "Median WIS", aggr = Median) +
  scale_y_log10()

Counties

dot_plot(score_cards_county, var = "wis", ylab = "Median WIS", aggr = Median) +
  scale_y_log10()

And now we change the aggregator function to be the 90th percentile.

States

dot_plot(score_cards_state, var = "wis", ylab = "90th percentile WIS", 
         aggr = function(x) quantile(x, prob = 0.9, na.rm = TRUE)) +
  scale_y_log10()

Counties

dot_plot(score_cards_county, var = "wis", ylab = "90th percentile WIS", 
         aggr = function(x) quantile(x, prob = 0.9, na.rm = TRUE)) +
  scale_y_log10()

Line plots

Now we make line plots: one line per ahead and forecaster, as a function of forecast date. Here we use the mean as the aggregator function, and we look at WIS and coverage-80.

# Build a long qualitative palette by concatenating all of RColorBrewer's
# qualitative palettes, and return the first n colors.
# (Adapted from https://stackoverflow.com/questions/15282580/)
color_picker = function(n) {
  qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
  pal = unlist(mapply(brewer.pal, qual_col_pals$maxcolors,
                      rownames(qual_col_pals)))
  # Fix: the original ignored `n` entirely and returned the full ~70-color
  # vector. ggplot's scale_color_manual() tolerated the extras, but honoring
  # `n` keeps the function's contract honest. head() returns at most n
  # elements and never pads with NA if n exceeds the palette size.
  head(pal, n)
}

# Line plot of a score column: one line per forecaster, as a function of
# target date, faceted by ahead.
line_plot = function(df, var = "wis", ylab = var, ylim = NULL, aggr = Mean) {
  df_sum = summarize_var(df, var, aggr)
  df_sum$ahead = factor(paste("ahead =", df_sum$ahead))
  
  ggplot(df_sum, aes(x = target_end_date, y = var)) +
    geom_line(aes(color = forecaster, linetype = forecaster)) +
    geom_point(aes(color = forecaster)) +
    facet_wrap(vars(ahead), scales = "free") + 
    labs(x = "Date", y = ylab) +
    coord_cartesian(ylim = ylim) +
    # Bug fix: the original called unique(forecaster), but `forecaster` is a
    # column of df_sum, not an object in this function's scope -- outside of
    # aes() that reference would raise "object 'forecaster' not found".
    scale_color_manual(values = color_picker(length(unique(df_sum$forecaster))))
}

States

line_plot(score_cards_state, var = "wis", ylab = "Mean WIS") + scale_y_log10() 

line_plot(score_cards_state, var = "cov_80", ylab = "Coverage-80", ylim = c(0,1)) +
  geom_hline(yintercept = 0.8)

Counties

line_plot(score_cards_county, var = "wis", ylab = "Mean WIS") + scale_y_log10() 

line_plot(score_cards_county, var = "cov_80", ylab = "Coverage-80", ylim = c(0,1)) +
  geom_hline(yintercept = 0.8)

Scaling by baseline

We scale each score, per location and forecast date, by the COVIDhub-baseline score; then we take the mean or median.

Important note on the order of operations here: scale then aggregate. The other way around: aggregate then scale, would be a simple post-adjustment applied to the metrics we computed earlier. This way: scale then aggregate, results in a different final metric altogether. It is potentially interesting as it provides a nonparametric spatiotemporal adjustment; assuming that space and time effects are multiplicative, we’re directly “canceling them out” by taking ratios.

Here are dot plots for scaled WIS, with mean as the aggregator. Omitting median for brevity.

# Note to self: mutate_at() gave me a weird bug below! From now on, better use
# mutate() with across() instead ...
#
# Scale each score in column `var`, per (geo_value, target_end_date, ahead)
# cell, by the corresponding score of `base_forecaster`. Strategy: pivot wide
# so each forecaster's score becomes its own column (name = `var` prefix +
# forecaster), divide every such column by the baseline's column, then pivot
# back to long form with the ratios in a `scaled` column. The baseline itself
# (whose ratio is identically 1) is dropped from the result.
scale_df = function(df, var, base_forecaster = "COVIDhub-baseline") {
  df %>% select(-c(quantile, value, forecast_date)) %>%
    distinct() %>%
    pivot_wider(id_cols = c(geo_value, target_end_date, ahead),
                names_from = "forecaster", names_prefix = var, 
                values_from = var) %>%
    mutate(across(starts_with(var), ~ .x /
                !!as.symbol(paste0(var, base_forecaster)))) %>%
    pivot_longer(cols = starts_with(var), names_to = "forecaster",
                 values_to = "scaled") %>%
    # Strip the `var` prefix to recover the plain forecaster name.
    mutate(forecaster = substring(forecaster, nchar(var) + 1)) %>%
    filter(forecaster != base_forecaster)
}

States

dot_plot(scale_df(score_cards_state, var = "wis"), var = "scaled", 
         ylab = "Mean scaled WIS") + geom_hline(yintercept = 1) 

Counties

dot_plot(scale_df(score_cards_county, var = "wis"), var = "scaled", 
         ylab = "Mean scaled WIS") + geom_hline(yintercept = 1) 

Here are now line plots for mean scaled WIS.

States

line_plot(scale_df(score_cards_state, var = "wis"), var = "scaled", 
          ylab = "Mean scaled WIS") + geom_hline(yintercept = 1) 

Counties

line_plot(scale_df(score_cards_county, var = "wis"), var = "scaled", 
          ylab = "Mean scaled WIS") + geom_hline(yintercept = 1) 

Centering by baseline

Similar to what we did previously but just with centering instead of scaling.

Note on order of operations: center then aggregate versus aggregate then center are still in general different strategies. As before we’re adhering to the first way, with a similar motivation: if space and time effects were now additive, then this way would “cancel them out” directly. However, when the aggregator is a linear operation (e.g., mean), the two strategies essentially reduce to the same thing (“essentially”, not exactly, because setting na.rm = TRUE generally turns any linear operator into a nonlinear one).

Here are the dot plots for mean centered WIS. Omitting median for brevity.

# Center (rather than scale) each score by the baseline's, reusing scale_df():
# exponentiate the scores, take ratios via scale_df(), then take logs, since
# log(exp(a) / exp(b)) = a - b. NOTE(review): exp() can overflow to Inf for
# very large scores (log of the ratio then becomes Inf/NaN) -- presumably not
# an issue at the score magnitudes seen here, but worth confirming.
center_df = function(df, var, base_forecaster = "COVIDhub-baseline") {
  scale_df(df %>% mutate(y = exp(!!as.symbol(var))), "y", base_forecaster) %>%
    mutate(centered = log(scaled)) %>% select(-scaled)
}

States

dot_plot(center_df(score_cards_state, var = "wis"), var = "centered", 
         ylab = "Mean centered WIS") + geom_hline(yintercept = 0)  

Counties

dot_plot(center_df(score_cards_county, var = "wis"), var = "centered", 
         ylab = "Mean centered WIS") + geom_hline(yintercept = 0) 

Here are now the line plots for mean centered WIS.

States

line_plot(center_df(score_cards_state, var = "wis"), var = "centered", 
          ylab = "Mean centered WIS") + geom_hline(yintercept = 0) 

Counties

line_plot(center_df(score_cards_county, var = "wis"), var = "centered", 
          ylab = "Mean centered WIS") + geom_hline(yintercept = 0) 

Pairwise tournament

We run a pairwise tournament. This is inspired by Johannes Bracher’s analysis (and similar ideas in the literature). Except, the order of operations here is different: scale then aggregate (whereas Johannes did: aggregate then scale). The motivation for this was explained above (thinking of it as providing a nonparametric spatiotemporal adjustment), as was the fact that the order of operations really makes a difference.

For each pair of forecasters \(f\) and \(g\), we compute:

\[ \theta_{fg} = A\bigg\{ \frac{S(f;\ell,d,a)}{S(g;\ell,d,a)} \;:\; \text{common locations $\ell$, forecast dates $d$, and ahead values $a$} \bigg\} \]

where \(S\) is a score of interest, say WIS, and \(A\) is an aggregator of interest, say the mean. Important note: we aggregate over all common locations, dates, and ahead values, which may differ for each pair \(f,g\). To compute an overall metric for forecaster \(f\), we use:

\[ \theta_f = \bigg( \prod_g \theta_{fg} \bigg)^{1/F}. \]

the geometric mean of all pairwise comparisons of \(f\) to other forecasters (here \(F\) is the total number of forecasters). Another interesting option would be to define \((\theta_f)_{f \in F}\) as the top left singular vector of the matrix \((\theta_{fg})_{f,g \in F}\), which we’ll also investigate.

# Pairwise tournament over a score column. For each pair of forecasters
# (f2, f), aggregate the ratio of f2's score to f's over their common
# (location, date, ahead) cells. theta_mat[f2, f] holds that aggregate;
# the diagonal (a forecaster vs. itself) is left NA.
#
# @param df   Score card data frame.
# @param var  Name of the score column, as a string.
# @param aggr Aggregator applied to the score ratios (default: Mean).
# @return list(mat = pairwise matrix, df = long form of mat for ggplot,
#              vec1 = per-forecaster geometric mean of its row,
#              vec2 = top left singular vector of mat with unit diagonal).
pairwise_tournament = function(df, var, aggr = Mean) {
  forecasters = unique(df$forecaster)
  theta_mat = matrix(NA, length(forecasters), length(forecasters))
  rownames(theta_mat) = colnames(theta_mat) = forecasters
  for (f in forecasters) {
    # scale_df() divides every forecaster's scores by f's and drops f itself,
    # so `result` has one aggregated ratio per opponent of f.
    result =  scale_df(df, var, base_forecaster = f) %>% 
      group_by(forecaster) %>%
      summarize(v = aggr(scaled))
    theta_mat[result$forecaster, f] = result$v
  }
  
  # Convert to data frame for convenience with ggplot
  theta_df = as.data.frame(theta_mat) %>%
    mutate(Forecaster1 = forecasters) %>%
    pivot_longer(cols = -Forecaster1, names_to = "Forecaster2",
                 values_to = "value")
  
  # Compute overall metrics two ways: geometric mean, SVD.
  # vec1: row-wise geometric mean; na.rm excludes the NA diagonal.
  theta_vec1 = exp(rowMeans(log(theta_mat), na.rm = TRUE))
  diag(theta_mat) = 1 # so the SVD won't fail; undo it later
  # NOTE(review): the sign of svd()'s singular vector is arbitrary --
  # presumably fine since vec2 is only plotted against vec1; confirm before
  # using vec2 for ranking directly.
  theta_vec2 = as.numeric(svd(theta_mat, nu = 1)$u)
  names(theta_vec2) = names(theta_vec1)
  diag(theta_mat) = NA
  
  return(list(mat = theta_mat, df = theta_df, vec1 = theta_vec1, 
              vec2 = theta_vec2))
}

States

# Run the tournament on state-level WIS. The aggregator is a 1%-trimmed mean,
# which blunts the influence of extreme score ratios.
theta_state = pairwise_tournament(score_cards_state, var = "wis", aggr = 
                              function(x) mean(x, trim = 0.01, na.rm = TRUE))
# Rank forecasters by the geometric-mean overall metric (lower is better).
ranked_list = rownames(theta_state$mat)[order(theta_state$vec1)]
colors = colorRampPalette(brewer.pal(n = 6, name = "RdBu"))(30)
# Heatmap of the pairwise ratios, rows/columns ordered best-to-worst.
ggplot(theta_state$df, aes(x = factor(Forecaster2, levels = rev(ranked_list)),
                     y = factor(Forecaster1, levels = rev(ranked_list)))) +
  geom_tile(aes(fill = value)) +
  geom_text(aes(label = round(value, 3))) +
  scale_fill_gradientn(colours = colors) +
  labs(x = NULL, y = NULL) +
  theme_bw() + theme(legend.position = "none", 
                     axis.text.x = element_text(angle = 90, hjust = 1))

# Overall metric (computed via GM of pairwise metrics):
knitr::kable(data.frame(rank = 1:length(theta_state$vec1), forecaster = ranked_list,
                        theta_state = sort(theta_state$vec1), row.names = NULL))
rank forecaster theta_state
1 COVIDhub-ensemble 0.9974964
2 OliverWyman-Navigator 1.1005163
3 UMass-MechBayes 1.1615139
4 CMU-TimeSeries 1.1855710
5 GT-DeepCOVID 1.2488660
6 MOBS-GLEAM_COVID 1.3583980
7 COVIDhub-baseline 1.4393616
8 LANL-GrowthRate 1.4731522
9 CovidAnalytics-DELPHI 1.6577773
10 JHU_IDD-CovidSP 1.8660757
11 UCLA-SuEIR 2.1753458
12 UT-Mobility 2.2269679

Counties

theta_county = pairwise_tournament(score_cards_county, var = "wis", aggr = 
                              function(x) mean(x, trim = 0.01, na.rm = TRUE))

ranked_list = rownames(theta_county$mat)[order(theta_county$vec1)]
colors = colorRampPalette(brewer.pal(n = 6, name = "RdBu"))(30)
ggplot(theta_county$df, aes(x = factor(Forecaster2, levels = rev(ranked_list)),
                     y = factor(Forecaster1, levels = rev(ranked_list)))) +
  geom_tile(aes(fill = value)) +
  geom_text(aes(label = round(value, 3))) +
  scale_fill_gradientn(colours = colors) +
  labs(x = NULL, y = NULL) +
  theme_bw() + theme(legend.position = "none", 
                     axis.text.x = element_text(angle = 90, hjust = 1))

# Overall metric (computed via GM of pairwise metrics):
knitr::kable(data.frame(rank = 1:length(theta_county$vec1), forecaster = ranked_list,
                        theta_county = sort(theta_county$vec1), row.names = NULL))
rank forecaster theta_county
1 COVIDhub-ensemble 0.8974899
2 CMU-TimeSeries 0.9498551
3 COVIDhub-baseline 1.1081821
4 LANL-GrowthRate 1.2206064
5 UMass-MechBayes 1.2817611
6 UCLA-SuEIR 1.7641711
7 JHU_IDD-CovidSP 2.3352154

For curiosity, we can plot the agreement between the overall metrics computed via GM and via SVD. The agreement is basically perfect!

State

ggplot() + geom_point(aes(x = theta_state$vec1, y = theta_state$vec2)) +
  labs(x = "Geometric mean", y = "Top left singular vector")

County

ggplot() + geom_point(aes(x = theta_county$vec1, y = theta_county$vec2)) +
  labs(x = "Geometric mean", y = "Top left singular vector")

Pairwise tournament: median and 90th percentile

Repeat the same pairwise tournament but with the median as the aggregator.

State

theta_state = pairwise_tournament(score_cards_state, var = "wis", aggr = Median)

ranked_list = rownames(theta_state$mat)[order(theta_state$vec1)]
colors = colorRampPalette(brewer.pal(n = 6, name = "RdBu"))(30)
ggplot(theta_state$df, aes(x = factor(Forecaster2, levels = rev(ranked_list)),
                     y = factor(Forecaster1, levels = rev(ranked_list)))) +
  geom_tile(aes(fill = value)) +
  geom_text(aes(label = round(value, 3))) +
  scale_fill_gradientn(colours = colors) +
  labs(x = NULL, y = NULL) +
  theme_bw() + theme(legend.position = "none", 
                     axis.text.x = element_text(angle = 90, hjust = 1))

# Overall metric (computed via GM of pairwise metrics):
knitr::kable(data.frame(rank = 1:length(theta_state$vec1), forecaster = ranked_list,
                        theta_state = sort(theta_state$vec1), row.names = NULL))
rank forecaster theta_state
1 OliverWyman-Navigator 0.7354122
2 UMass-MechBayes 0.7649155
3 COVIDhub-ensemble 0.7766160
4 GT-DeepCOVID 0.8867322
5 CMU-TimeSeries 0.8972306
6 MOBS-GLEAM_COVID 0.9104654
7 COVIDhub-baseline 1.0554657
8 LANL-GrowthRate 1.0717744
9 CovidAnalytics-DELPHI 1.1642852
10 UT-Mobility 1.1956046
11 JHU_IDD-CovidSP 1.2309344
12 UCLA-SuEIR 1.6302796

County

theta_county = pairwise_tournament(score_cards_county, var = "wis", aggr = Median)

ranked_list = rownames(theta_county$mat)[order(theta_county$vec1)]
colors = colorRampPalette(brewer.pal(n = 6, name = "RdBu"))(30)
ggplot(theta_county$df, aes(x = factor(Forecaster2, levels = rev(ranked_list)),
                     y = factor(Forecaster1, levels = rev(ranked_list)))) +
  geom_tile(aes(fill = value)) +
  geom_text(aes(label = round(value, 3))) +
  scale_fill_gradientn(colours = colors) +
  labs(x = NULL, y = NULL) +
  theme_bw() + theme(legend.position = "none", 
                     axis.text.x = element_text(angle = 90, hjust = 1))

# Overall metric (computed via GM of pairwise metrics):
knitr::kable(data.frame(rank = 1:length(theta_county$vec1), forecaster = ranked_list,
                        theta_county = sort(theta_county$vec1), row.names = NULL))
rank forecaster theta_county
1 CMU-TimeSeries 0.7054242
2 COVIDhub-ensemble 0.7678441
3 COVIDhub-baseline 0.8502592
4 UMass-MechBayes 0.9350271
5 LANL-GrowthRate 0.9755181
6 UCLA-SuEIR 1.3357642
7 JHU_IDD-CovidSP 1.7821159

And now with the 90th percentile as the aggregator.

State

theta_state = pairwise_tournament(score_cards_state, var = "wis", aggr =
                              function(x) quantile(x, prob = 0.9, na.rm = TRUE))

ranked_list = rownames(theta_state$mat)[order(theta_state$vec1)]
colors = colorRampPalette(brewer.pal(n = 6, name = "RdBu"))(30)
ggplot(theta_state$df, aes(x = factor(Forecaster2, levels = rev(ranked_list)),
                     y = factor(Forecaster1, levels = rev(ranked_list)))) +
  geom_tile(aes(fill = value)) +
  geom_text(aes(label = round(value, 3))) +
  scale_fill_gradientn(colours = colors) +
  labs(x = NULL, y = NULL) +
  theme_bw() + theme(legend.position = "none", 
                     axis.text.x = element_text(angle = 90, hjust = 1))

# Overall metric (computed via GM of pairwise metrics):
knitr::kable(data.frame(rank = 1:length(theta_state$vec1), forecaster = ranked_list,
                        theta_state = sort(theta_state$vec1), row.names = NULL))
rank forecaster theta_state
1 COVIDhub-ensemble 1.951550
2 OliverWyman-Navigator 2.431646
3 CMU-TimeSeries 2.476555
4 UMass-MechBayes 2.485457
5 GT-DeepCOVID 2.639638
6 COVIDhub-baseline 2.988444
7 MOBS-GLEAM_COVID 3.027954
8 LANL-GrowthRate 3.157926
9 CovidAnalytics-DELPHI 3.661705
10 JHU_IDD-CovidSP 4.171476
11 UCLA-SuEIR 4.716802
12 UT-Mobility 5.547086

County

theta_county = pairwise_tournament(score_cards_county, var = "wis", aggr =
                              function(x) quantile(x, prob = 0.9, na.rm = TRUE))

ranked_list = rownames(theta_county$mat)[order(theta_county$vec1)]
colors = colorRampPalette(brewer.pal(n = 6, name = "RdBu"))(30)
ggplot(theta_county$df, aes(x = factor(Forecaster2, levels = rev(ranked_list)),
                     y = factor(Forecaster1, levels = rev(ranked_list)))) +
  geom_tile(aes(fill = value)) +
  geom_text(aes(label = round(value, 3))) +
  scale_fill_gradientn(colours = colors) +
  labs(x = NULL, y = NULL) +
  theme_bw() + theme(legend.position = "none", 
                     axis.text.x = element_text(angle = 90, hjust = 1))

# Overall metric (computed via GM of pairwise metrics):
knitr::kable(data.frame(rank = 1:length(theta_county$vec1), forecaster = ranked_list,
                        theta_county = sort(theta_county$vec1), row.names = NULL))
rank forecaster theta_county
1 COVIDhub-ensemble 1.619374
2 CMU-TimeSeries 1.926362
3 COVIDhub-baseline 2.225244
4 LANL-GrowthRate 2.356041
5 UMass-MechBayes 2.620181
6 UCLA-SuEIR 3.670698
7 JHU_IDD-CovidSP 4.722541